Brownfield sites are sites which have previously been developed and built on, often for industrial use. Now they are disused, which makes them candidate sites for the construction of new homes. In particular, building on existing brownfield sites would reduce urban sprawl into the green belt (for some more details see e.g. http://www.bbc.co.uk/schools/gcsebitesize/geography/urban_environments/urbanisation_medcs_rev5.shtml).
Using a multi-band satellite image of the Greater Manchester area and a pilot register of brownfield sites, we can extract the spectral signatures of these sites. We can then train a classification algorithm on these data. The classifier could then be applied to satellite images from other cities, where there is no brownfield register available.
import gdal
# import osgeo
from osgeo import osr
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
%matplotlib inline
Image data was provided by Satellite Applications Catapult: https://sa.catapult.org.uk/
# Root directory of the satellite imagery provided by Satellite Applications Catapult.
datapath = '/media/bjoern/data/datadive/SatelliteApplicationsCatapult/'
#Manchester
imname = 'SEN2_20170326_lat54lon217_T30UWE_vmsk_sharp_rad_srefdem_stdsref_tif.tif'
#Glasgow
#imname = 'SEN2_20170408_lat55lon371_T30UVG_vmsk_sharp_rad_srefdem_stdsref_tif.tif'
path = os.path.join(datapath,imname)
#read geotiff
# ds keeps the GDAL dataset handle (needed later for geo-referencing metadata);
# data holds the full pixel array, indexed as (band, row, col) below.
ds = gdal.Open(path)
data = ds.ReadAsArray()
#some settings for zoom-in on Manchester
# pixel-index window covering Greater Manchester
xmin,xmax = 4250,5750
ymin,ymax = 7000,8000
plt.rcParams['xtick.labelsize'] = 20 #40
plt.rcParams['ytick.labelsize'] = 20 #40
# axis extent in km (pixel size is 10 m, so pixel count / 1e2 = km)
extent = (0,(xmax-xmin)/1e2,0,(ymax-ymin)/1e2)
#band5
arr=data[4]
plt.figure(figsize=(24,24))
# normalize by the band maximum; clip the display range at 0.2 to bring out detail
plt.imshow(arr[ymin:ymax,xmin:xmax].astype('float')/float(arr.max()),cmap=plt.cm.gray,
    vmin=0,vmax=0.2,extent=extent)
cb = plt.colorbar(shrink=0.5)
cb.set_label('normalized surface reflectance',fontsize=30)
plt.grid(ls='--')
plt.xlabel('[km]',fontsize=40)
plt.ylabel('[km]',fontsize=40)
plt.title('Greater Manchester (Sentinel2 - Band5)',fontsize=40)
#plt.savefig('Manchester_overview_Band5.png',dpi=300,bbox_inches='tight')
# Zoom-in on the city centre: show Sentinel2 bands 2-5 side by side.
cxlo, cxhi = 4800, 5200
cylo, cyhi = 7200, 7600
# pixel size is 10 m, so pixel span / 1e2 converts the axis extent to km
centre_extent = (0, (cxhi - cxlo) / 1e2, 0, (cyhi - cylo) / 1e2)
fig, axes = plt.subplots(1, 4, figsize=(32, 8), sharey=True)
band_cmaps = [plt.cm.Blues, plt.cm.Greens, plt.cm.Reds, plt.cm.gray]
for ax, band, cmap in zip(axes, [1, 2, 3, 4], band_cmaps):
    marr = data[band, cylo:cyhi, cxlo:cxhi].astype('float')
    print(marr.mean())
    ax.imshow(marr,
              cmap=cmap,
              vmin=0, vmax=200,
              extent=centre_extent)
    ax.grid()
    ax.set_xlabel('[km]', fontsize=40)
axes[0].set_ylabel('[km]', fontsize=40)
plt.subplots_adjust(wspace=0.05)
axes[1].set_title('Manchester centre', fontsize=40)
axes[2].set_title('Sentinel2 Bands 2-5', fontsize=40)
Getting coordinates from the GeoTIFF, adapted from e.g. https://stackoverflow.com/questions/2922532/obtain-latitude-and-longitude-from-a-geotiff-file
# get the existing coordinate system (the GeoTIFF's native projection)
old_cs= osr.SpatialReference()
old_cs.ImportFromWkt(ds.GetProjectionRef())
# create the new coordinate system
# target: WGS 84 geographic coordinates (lat/lon), defined via its WKT string
wgs84_wkt = """
GEOGCS["WGS 84",
DATUM["WGS_1984",
SPHEROID["WGS 84",6378137,298.257223563,
AUTHORITY["EPSG","7030"]],
AUTHORITY["EPSG","6326"]],
PRIMEM["Greenwich",0,
AUTHORITY["EPSG","8901"]],
UNIT["degree",0.01745329251994328,
AUTHORITY["EPSG","9122"]],
AUTHORITY["EPSG","4326"]]"""
new_cs = osr.SpatialReference()
new_cs.ImportFromWkt(wgs84_wkt)
# create a transform object to convert between coordinate systems
transform = osr.CoordinateTransformation(old_cs,new_cs)
#get the point to transform, pixel (0,0) in this case
width = ds.RasterXSize
height = ds.RasterYSize
# geotransform tuple; gt[0]/gt[3] are the projected coordinates of the
# upper-left corner, gt[1]/gt[5] the pixel width/height (see comment below)
gt = ds.GetGeoTransform()
gt
# left, x-axis east, x-axis north, upper, y-axis east, y-axis north
# i.e. origin is upper left
#get the coordinates in lat long
latlong = transform.TransformPoint(gt[0],gt[3])#minx,miny)
latlong
Manchester's coordinates are: -2.244644 (long), 53.483959 (lat).
# pilot brownfield register for Greater Manchester
df = pd.read_csv('./greatermanchester_brownfieldregister_2016-07-01_rev1.1.csv')
df.head()
# missing coordinates
# count register entries without a GeoX coordinate (handled later by try/except)
df.GeoX.isnull().sum()
#using inverse transform from above to translate lat,lon into UTM
InverseTransform = osr.CoordinateTransformation(new_cs,old_cs)
# NOTE(review): TransformPoint is called as (GeoX, GeoY); GDAL's expected axis
# order for geographic CRS changed between GDAL 2 (lon,lat) and GDAL 3 (lat,lon)
# -- confirm against the installed GDAL version.
easting,northing,_ = InverseTransform.TransformPoint(df.GeoX[0],df.GeoY[0])
easting,northing
pixel coordinates, adapted from https://gis.stackexchange.com/questions/221292/retrieve-pixel-value-with-geographic-coordinate-as-input-with-gdal
# geotransform components needed to map projected coordinates to pixel indices
xOrigin = gt[0]
yOrigin = gt[3]
pixelWidth = gt[1]
pixelHeight = gt[5] #removed minus for consistency
#testing for one point
point = (easting,northing)#list of X,Y coordinates
#pixel coordinates
# column from the easting offset, row from the northing offset
# (gt[5] is typically negative for north-up rasters, so the row comes out positive)
col = int((point[0] - xOrigin) / pixelWidth)
row = int((point[1] - yOrigin) / pixelHeight) #changed order
print(row,col, arr[row,col])
# Translate every register entry's (GeoX, GeoY) into pixel (row, col) coordinates.
# Entries with missing coordinates are kept as (None, None) so that `coords`
# stays aligned with the register rows; later code filters on `row is not None`.
coords = []
for i in range(df.shape[0]):
    try:
        easting,northing,_ = InverseTransform.TransformPoint(df.GeoX[i],df.GeoY[i])
        point = (easting,northing)
        col = int((point[0] - xOrigin) / pixelWidth)
        row = int((point[1] - yOrigin) / pixelHeight)
    except Exception:  # narrowed from a bare except: don't swallow KeyboardInterrupt/SystemExit
        print('missing coordinate: skipping')
        row,col = None,None
    coords.append((row,col))
coords = np.array(coords)
#image boundaries (pixel size is 10m)
xmin2,xmax2 = 2750,6750
ymin2,ymax2 = 5000,9000
extent2 = (0,(xmax2-xmin2)/100,0,(ymax2-ymin2)/100)
fig,ax = plt.subplots(figsize=(24,24))
# Band5 cut-out, normalized to the band maximum and clipped at 0.2 for contrast
im = ax.imshow(arr[ymin2:ymax2,xmin2:xmax2]/arr.max(),
    cmap=plt.cm.gray,vmin=0,vmax=1./5.,
    extent=extent2)
cb = plt.colorbar(im,shrink=0.5)
cb.set_label('normalized surface reflectance',fontsize=30)
ax.grid(ls='--')
ax.set_title('Greater Manchester (Sentinel2 - Band5)',fontsize=40)
ax.set_xlabel('[km]',fontsize=30)
ax.set_ylabel('[km]',fontsize=30)
# overlay the brownfield register entries (skipping missing coordinates);
# the row is flipped (ymax2-row) because the array origin is the upper-left corner
for (row,col) in coords:
    if row is None: continue
    ax.scatter((col-xmin2)/100,(ymax2-row)/100,color='r')
ax.set_xlim(0,(xmax2-xmin2)/100)
ax.set_ylim(0,(ymax2-ymin2)/100)
# plt.savefig('Manchester_brownfields.png',dpi=100,bbox_inches='tight')
Extracting the pixel values at the brownfield locations
# For each spectral band, collect the pixel value at every brownfield location.
# Rows with missing register coordinates (row is None) are skipped, so each
# band yields the same set of valid sample points.
vals_all = []
for band_img in data:
    band_vals = [band_img[r, c] for (r, c) in coords if r is not None]
    vals_all.append(band_vals)
vals_all = np.array(vals_all)
vals_all
# Per-band histograms of the brownfield reflectance values.
for band_idx, band_vals in enumerate(vals_all):
    plt.hist(band_vals, bins=50, label='Band {}'.format(band_idx))
plt.legend()
# mean brownfield spectrum across the 10 bands,
# error bars are the standard error of the mean
x=np.arange(10)
plt.errorbar(x,vals_all.mean(axis=1),yerr=vals_all.std(axis=1)/np.sqrt(vals_all.shape[1]))
plt.xlabel('band',fontsize=20)
plt.ylabel('average surface reflectance',fontsize=20)
#compare with random points in image
# Bugfix: row indices must be drawn from the image height (RasterYSize) and
# column indices from the width (RasterXSize). The original drew both from
# RasterXSize, which can index out of bounds (or bias the sample) whenever the
# raster is not square. Also, np.random.randint's `high` is exclusive, so the
# previous `high=RasterXSize-1` could never pick the last valid index.
rand_rows = np.random.randint(low=0, high=ds.RasterYSize, size=coords.shape[0])
rand_cols = np.random.randint(low=0, high=ds.RasterXSize, size=coords.shape[0])
coords_random = np.stack([rand_rows, rand_cols], axis=1)
vals_random = []
for d in data:
    vals = [ d[c[0],c[1]] for c in coords_random if c[0] is not None ]
    vals_random.append(vals)
vals_random = np.array(vals_random)
vals_random
# mean spectra of brownfield sites vs random positions (standard-error bars)
plt.errorbar(x,vals_all.mean(axis=1),yerr=vals_all.std(axis=1)/np.sqrt(vals_all.shape[1]),
    label = 'brownfield')
plt.errorbar(x,vals_random.mean(axis=1),yerr=vals_random.std(axis=1)/np.sqrt(vals_random.shape[1]),
    label = 'random positions')
plt.xlabel('band',fontsize=20)
plt.ylabel('average surface reflectance',fontsize=20)
ax = plt.gcf().gca()
# overplot every individual spectrum at low opacity to visualize the scatter
# (zip truncates to the shorter of the two sample sets)
for va,vr in zip(vals_all.T,vals_random.T):
    ax.plot(x,va,c='C0',lw=0.2,alpha=0.2)
    ax.plot(x,vr,c='C1',lw=0.2,alpha=0.2)
ax.legend()
Brownfield sites have a spectral signature that is distinct from random sites. Most notably, they do not show an increase in the near infrared. This makes sense, I would expect brownfield sites to have little to no vegetation. Furthermore, brownfield sites have smaller scatter than random sites. This reflects the relatively uniform characteristics of brownfield sites, in contrast to the more diverse nature of random sites (housing, parks, agriculture,...).
# same with "magnitude" = log (reflectance)
log_vals_all = np.log10(vals_all)
log_vals_random = np.log10(vals_random)
plt.errorbar(x,log_vals_all.mean(axis=1),yerr=log_vals_all.std(axis=1)/np.sqrt(log_vals_all.shape[1]),
    label = 'brownfield')
plt.errorbar(x,log_vals_random.mean(axis=1),yerr=log_vals_random.std(axis=1)/np.sqrt(log_vals_random.shape[1]),
    label = 'random positions')
plt.xlabel('band',fontsize=20)
plt.ylabel('log (surface reflectance)',fontsize=20)
ax = plt.gcf().gca()
# individual log-spectra at low opacity
for va,vr in zip(log_vals_all.T,log_vals_random.T):
    ax.plot(x,va,c='C0',lw=0.2,alpha=0.2)
    ax.plot(x,vr,c='C1',lw=0.2,alpha=0.2)
ax.legend()
# zoom the y-axis to the populated range
ax.set_ylim(1.5,2.8)
# Collect the log-reflectance samples into labelled pandas DataFrames, one
# column per band, with a 'kind' column tagging the class of each sample.
band_names = ['Band{}'.format(i) for i in range(vals_all.shape[0])]
valdf = pd.DataFrame(data=log_vals_all.T, columns=band_names)
valdf = valdf.assign(kind='brownfield')
#also for randoms
randf = pd.DataFrame(data=log_vals_random.T, columns=band_names)
randf = randf.assign(kind='random')
# stack brownfield and random samples into a single frame
stackdf = pd.concat([valdf, randf], axis=0)
stackdf.head()
stackdf.tail()
# pairwise scatter plots of all band combinations, colored by class,
# with per-band histograms on the diagonal
g = sn.PairGrid(stackdf,hue='kind')
g.map_diag(plt.hist,bins=20,alpha=0.2)
g.map_offdiag(plt.scatter,s=6,alpha=0.2)
g.add_legend(fontsize=30)
# plt.savefig('log_reflectance_brownfield_vs_rand.png')
The different clustering in log(reflectance) space across the 10 different bands should make it possible to identify brownfield sites.
Classification steps:
(in parts adapted from scikit-learn tutorial by Corran Webster, Enthought)
#replacing tag with numerical class
stackdf['kind'] = stackdf['kind'].map({'brownfield': 1, 'random': 0})
#convert to numpy array for sklearn input
stackarr = np.array(stackdf)
stackdat = stackarr[:,:-1]
stackclass = stackarr[:,-1]
stackclass.size
#randomize order (previously stacked data frame was sorted)
b = np.arange(stackclass.size)
np.random.shuffle(b)
features = stackdat[b]
target = stackclass[b]
(saving 20% for testing)
from sklearn.model_selection import train_test_split
# hold out 20% of the samples as an independent test set
data_train, data_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2)
data_train.shape, data_test.shape, target_train.shape, target_test.shape
For some classifiers (e.g. Support Vector Machines), we need to scale the data so that every feature has mean = 0 and unit variance; otherwise different features will not be weighted equally. Because we are using the logarithm of the surface reflectance, the range of the different features is already similar, but we will still do some basic preprocessing.
from sklearn.preprocessing import RobustScaler
#Robust scaler that uses only the central 50%
# (median/IQR based, so outliers barely influence the scaling)
scaler = RobustScaler()
# fit the scaling parameters on the training data only (no test-set leakage)
scaler.fit(data_train)
#rescale training data
scaled_train = scaler.transform(data_train)
# visual check of the rescaled feature distributions
scaled_df = pd.DataFrame(data=scaled_train, columns=stackdf.columns[:-1])
sn.pairplot(scaled_df)
Some of our bands are highly correlated, so we can check if we can reduce the dimensionality with a principal component analysis. This is by no means crucial with only 10 input dimensions, but can help significantly in higher-dimensional problems.
# Fixed import: sklearn.decomposition.pca is a private module that was removed
# in scikit-learn 0.24; the public import path is sklearn.decomposition.
from sklearn.decomposition import PCA
# a float n_components keeps just enough components to explain 99% of the variance
pca = PCA(n_components=0.99)
pca.fit(scaled_train)
print('number of components = {}'.format(pca.n_components_))
print('explained variance = : ',pca.explained_variance_ratio_)
In this case, PCA reduces the number of dimensions by a factor of 2, while retaining 99% of the variance in the data set.
#apply transformation to our scaled training data
reduced_train = pca.transform(scaled_train)
reduced_train.shape
# inspect the PCA-reduced feature space
reduced_df = pd.DataFrame(data=reduced_train,
    columns=['PCA comp{}'.format(i) for i in range(pca.n_components_)])
sn.pairplot(reduced_df)
Here we train a simple SVM classifier on our rescaled and reduced training data.
from sklearn.svm import SVC
# SVM classifier; probability=True enables the predict_proba call further down
clf = SVC(probability=True)
clf.fit(reduced_train,target_train)
Crucially, we need to apply the same transformations to our test data set.
# apply the *same* scaler and PCA fitted on the training data to the test data
scaled_test = scaler.transform(data_test)
reduced_test = pca.transform(scaled_test)
predicted = clf.predict(reduced_test)
print('SVM classification accuracy = {:3.2f}'.format(clf.score(reduced_test,target_test)))
Not bad for such a simple model, but obviously still with a lot of scope for improvement.
We can also predict the probabilities of any given pixel belonging to a brownfield.
# per-sample probability of belonging to each class (column 1 = brownfield)
clf.predict_proba(reduced_test)[:10]
from sklearn.metrics import confusion_matrix
# Fixed argument order: confusion_matrix expects (y_true, y_pred). The original
# passed (predicted, target_test), which transposes the matrix and swaps the
# false-positive and false-negative counts.
confusion_matrix(target_test, predicted)
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
# same preprocessing chained with a k-nearest-neighbours classifier; the
# Pipeline guarantees the transforms are applied consistently to train and test
pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=0.99)),
    ('model', KNeighborsClassifier()),
])
pipeline.fit(data_train, target_train)
predicted_knn = pipeline.predict(data_test)
print('KNN classification accuracy = {:3.2f}'.format(pipeline.score(data_test,target_test)))
from sklearn.neural_network import MLPClassifier
# same preprocessing, this time feeding a multi-layer perceptron (default settings)
pipeline2 = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=0.99)),
    ('model', MLPClassifier()),
])
pipeline2.fit(data_train, target_train)
predicted_NN = pipeline2.predict(data_test)
print('Neural network classification accuracy = {:3.2f}'.format(pipeline2.score(data_test,target_test)))
A prediction accuracy of 80-85% sounds already pretty good. However, we have to keep in mind that such a classifier would be run on a satellite image with millions of pixels, and maybe 1000 actual brownfield sites. So this could leave us with more false positives than true brownfield sites.
However, so far we have only used the default settings of the various algorithms. Quite likely we could further improve the classification accuracy by tweaking the details of the algorithm. In particular, we should try to minimize the rate of false positives, i.e. non-brownfield sites that are mistakenly classified as brownfield sites.